
rm(list = ls())
getmode <- function(v, na.omit = TRUE){
  if (na.omit == TRUE) {
    v <- na.omit(v)  
  }
  uv <- unique(v)
  tab <- tabulate(match(v, uv))
  out <- uv[tab == max(tab)]
  if(length(out) > 1){
    out <- sample(out,1)
  }
  return(out)
}

library(tidyverse)
setwd("~/Downloads/dataverse_files(3)")
DtoD <- read_csv("Data/DtoDCommitteeMatches3.csv") %>% dplyr::select(-'...1')
DtoR <- read_csv("Data/DtoRCommitteeMatches3.csv") %>% dplyr::select(-'...1')
RtoR <- read_csv("Data/RtoRCommitteeMatches3.csv") %>% dplyr::select(-'...1')
RtoD <- read_csv("Data/RtoDCommitteeMatches3.csv") %>% dplyr::select(-'...1')


names(DtoR) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_title", "Matched_title")

names(DtoD) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_title", "Matched_title")
names(RtoD) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_title", "Matched_title")
names(RtoR) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_title", "Matched_title")


DtoD$MatchType <- "DD"
DtoR$MatchType <- "DR"
RtoD$MatchType <- "RD"
RtoR$MatchType <- "RR"

doc_pairs <- bind_rows(DtoR, DtoD, RtoD, RtoR)


hearings<- read_csv("Data/CongressDocumentsCitations.csv")

####In matched pairs, do Democrats cite MORE science than Republicans?
hearings_pol_citation_count <- hearings %>% group_by(policy_document_id) %>% summarise(num_refs = length(unique(dois_cited[!is.na(dois_cited)])))
hearings_pol_citation_count <- hearings_pol_citation_count %>% filter(!is.na(policy_document_id))

doc_pairs_citation <- doc_pairs %>% 
  left_join(hearings_pol_citation_count, by = c("Prime_policy_doc_id" = "policy_document_id")) %>% 
  left_join(hearings_pol_citation_count, by = c("Matched_policy_doc_id" = "policy_document_id")) %>%
  rename(Prime_num_refs = num_refs.x, Matched_num_refs = num_refs.y)

DR_t <- t.test(filter(doc_pairs_citation, MatchType == "DR")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "DR")$Matched_num_refs)
DD_t <- t.test(filter(doc_pairs_citation, MatchType == "DD")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "DD")$Matched_num_refs)
RD_t <- t.test(filter(doc_pairs_citation, MatchType == "RD")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RD")$Matched_num_refs)
RR_t <- t.test(filter(doc_pairs_citation, MatchType == "RR")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RR")$Matched_num_refs)

library(broom)
library(purrr)

tab <- map_df(list(DR_t, DD_t, RD_t, RR_t), tidy)
tab <- bind_cols("MatchType" = c("DR", "DD", "RD", "RR"), tab)
tab$filter <- "All"


DR_t_top <- t.test(filter(doc_pairs_citation, MatchType == "DR", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "DR", Sim >.91, Sim < 1)$Matched_num_refs)
DD_t_top <- t.test(filter(doc_pairs_citation, MatchType == "DD", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "DD", Sim >.91, Sim < 1)$Matched_num_refs)
RD_t_top <- t.test(filter(doc_pairs_citation, MatchType == "RD", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RD", Sim >.91, Sim < 1)$Matched_num_refs)
RR_t_top <- t.test(filter(doc_pairs_citation, MatchType == "RR", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RR", Sim >.91, Sim < 1)$Matched_num_refs)

tab_top <- map_df(list(DR_t_top, DD_t_top, RD_t_top, RR_t_top), tidy)
tab_top <- bind_cols("MatchType" = c("DR", "DD", "RD", "RR"), tab_top)
tab_top$filter <- "(.91-1)"

tab_all <- bind_rows(tab, tab_top)

write_csv(tab_all, "Output/TableS22.csv")

doi_id <- read_csv("Data/CongTTDOI.csv") %>% dplyr::select(-`...1`)


doc_pairs <- doc_pairs %>% left_join(dplyr::select(hearings, policy_document_id, dois_cited), by = c("Prime_policy_doc_id" = "policy_document_id")) %>%
  left_join(doi_id, by = c("dois_cited" = "externalids"))
  
doc_pairs <- doc_pairs %>% left_join(dplyr::select(hearings, policy_document_id, dois_cited), by = c("Matched_policy_doc_id" = "policy_document_id")) %>%
  left_join(doi_id, by = c("dois_cited.y" = "externalids"))

doc_pairs <- doc_pairs %>%drop_na() %>% unique()

gc()
cited_paper_embeddings <- read_csv("Data/CongTTPaperEmbeddings3272023.csv")
cited_paper_embeddings <- cited_paper_embeddings %>% dplyr::select(-`...1`)

doc_pairs <- doc_pairs %>% left_join(cited_paper_embeddings, by = c("corpusid.x" = "corpusid"))


doc_pairs <- doc_pairs %>% left_join(cited_paper_embeddings, by = c("corpusid.y" = "corpusid"))

rm(list = c("cited_paper_embeddings"))
gc()

doc_pairs_DRRD <- doc_pairs %>% filter(MatchType %in% c("DR", "RD"), Sim > .91, Sim < 1)

doc_pairs_DRRD_D <- doc_pairs_DRRD %>% dplyr::filter(Prime_party_control == "D")

doc_pairs_DRRD_D1 <- doc_pairs_DRRD_D %>% dplyr::select(Prime_party_control, contains(".x"))
doc_pairs_DRRD_D2 <- doc_pairs_DRRD_D %>% dplyr::select(Matched_party_control, contains(".y"))


doc_pairs_DRRD_R <- doc_pairs_DRRD %>% dplyr::filter(Prime_party_control == "R")

doc_pairs_DRRD_R1 <- doc_pairs_DRRD_R %>% dplyr::select(Prime_party_control, contains(".x"))
doc_pairs_DRRD_R2 <- doc_pairs_DRRD_R %>% dplyr::select(Matched_party_control, contains(".y"))

doc_pairs_DRRD_R1D1 <- bind_rows(doc_pairs_DRRD_D1, doc_pairs_DRRD_R1)
doc_pairs_DRRD_R1D1 <- doc_pairs_DRRD_R1D1 %>% unique()

doc_pairs_DRRD_R2D2 <- bind_rows(doc_pairs_DRRD_D2, doc_pairs_DRRD_R2)
doc_pairs_DRRD_R2D2 <- doc_pairs_DRRD_R2D2 %>% unique()

doc_pairs_DRRD_R2D2$prime <- 0
doc_pairs_DRRD_R1D1$prime <- 1

names(doc_pairs_DRRD_R2D2) <- names(doc_pairs_DRRD_R1D1)
doc_pairs_DRRD_all <- bind_rows(doc_pairs_DRRD_R2D2, doc_pairs_DRRD_R1D1)


library(ClusterR)

cluster_papers <- function(data){
  bsize = nrow(data)/3
  bsize = min(c(1000,bsize))
  
  max_clust = min(nrow(data) - 2, 75)
  
  cluster_dat <- data %>% dplyr::select(contains("dim"))
  if (nrow(cluster_dat) > 50){
    pmk <- Optimal_Clusters_KMeans(as.matrix(cluster_dat), max_clusters = max_clust, verbose = T, criterion = "BIC",
                                   mini_batch_params = list("batch_size" =  bsize, "init_fraction" = .1, "early_stop_iter" = 10), seed = 666)
    
    
    
    num_clust <- which(pmk == min(pmk))
    
    pmk_mini <- MiniBatchKmeans(
      as.matrix(cluster_dat),
      clusters = num_clust,
      batch_size = bsize,
      num_init = 1,
      max_iters = 100,
      init_fraction = 1,
      initializer = "kmeans++",
      early_stop_iter = 10,
      verbose = T,
      CENTROIDS = NULL,
      tol = 1e-04,
      tol_optimal_init = 0.3,
      seed = 1
    )
    
    cluster_assign <- predict_MBatchKMeans(as.matrix(cluster_dat), pmk_mini$centroids, fuzzy = FALSE)
    
    data$cluster <- cluster_assign
  } else {
    data$cluster <- NA
  }
  gc()
  return(data)
  
}
doc_pairs_DRRD_all <- doc_pairs_DRRD_all %>% drop_na()
doc_pairs_DRRD_all <- cluster_papers(doc_pairs_DRRD_all)
colnames(doc_pairs_DRRD_all)
library(RColorBrewer)
library(randomcoloR)

palette <- distinctColorPalette(length(unique(doc_pairs_DRRD_all$cluster)))

doc_pairs_DRRD_all <- doc_pairs_DRRD_all %>% mutate(cluster = fct_rev(fct_reorder(as.factor(cluster), dois_cited.x,  .fun = "n_distinct")))
doc_DR_matched_bar <- doc_pairs_DRRD_all %>% ggplot(aes(x=cluster, fill = as.factor(cluster))) + geom_bar() + facet_wrap(~ Prime_party_control, nrow = 2) + 
  scale_colour_manual(name = "cluster",values = palette) + scale_fill_manual(name = "cluster",values = palette) + theme_void() + theme(legend.position = "none")


chisq.test(table(doc_pairs_DRRD_all$Prime_party_control, doc_pairs_DRRD_all$cluster))

ggsave("Output/FigS22A.pdf", doc_DR_matched_bar, width = 8, height = 4)




library(cramer)


cramer_boots<- function(dat, permute = F){
  if (permute == T) {
    dat$PartyID <- sample(dat$PartyID)
  }
  print("making matrices")
  D_mat <- dat %>% filter(PartyID == "D") %>% sample_n(500, replace = T) %>% dplyr::select(contains("dim")) %>% as.matrix()
  R_mat <- dat %>% filter(PartyID == "R") %>% sample_n(500, replace = T) %>% dplyr::select(contains("dim")) %>% as.matrix()
  print("cramer test")
  ct <- cramer.test(x = D_mat, y = R_mat, just.statistic = T)
  gc()
  return(ct$statistic)
}



gen_cramer_replicates <- function(dat, n_reps = 100) {
  non_permuted_cramers <- replicate(n_reps, cramer_boots(dat))
  
  permuted_cramers <- replicate(n_reps, cramer_boots(dat, permute = T))
  cramers <- bind_cols("permuted_CS" = permuted_cramers, "nonpermuted_CS" = non_permuted_cramers) %>% pivot_longer(cols = 1:2, names_to = "permuted", values_to = "Cramer")
  cramers$FOR <- unique(dat$field_name)
  return(cramers)
}




library(lsa)
#pairwise similarity calculations code preserved
# sci_sim_vec = rep(NA, nrow(doc_pairs))
# for (i in 1:nrow(doc_pairs)){
#   row <- doc_pairs[i,]
#   sci_sim = cosine(as_vector(row[16:783]), as_vector(row[785:1552]))
#   sci_sim_vec[i] <- sci_sim 
# }
# 
# doc_pairs$sci_sim <- sci_sim_vec
# 
# doc_pairs_sims <- doc_pairs %>% dplyr::select(-contains("dim"))


#
doc_pairs_sims <- read_csv("Data/CongressMatchedPairCitationSimilarities.csv")



library(boot)
bootstrap_mean_ci <- function(y, R = 1000) {
  # Bootstrap procedure
  boot_func <- function(data, indices) mean(data[indices])
  bs_result <- boot(y, boot_func, R = R)
  
  mean_value <- mean(y)
  ci_values <- boot.ci(bs_result, type = "perc")$percent[4:5]  # 95% CI
  
  return(c(y = mean_value, ymin = ci_values[1], ymax = ci_values[2]))
}

filter(doc_pairs_sims, is.finite(sci_sim), Sim <1) %>% dplyr::select(PrimeIndex, MatchedIndex) %>% unique()
doc_pairs_sims <- doc_pairs_sims %>% mutate(MatchType = fct_relevel(MatchType, "DD", "DR", "RR", "RD"))

top_pairs_avg_max_sim_plot <- filter(doc_pairs_sims, is.finite(sci_sim), Sim <1, Sim > .91) %>%
  group_by(Prime_policy_doc_id, Matched_policy_doc_id, dois_cited.x, MatchType) %>% summarise(max_sci_sim = max(sci_sim)) %>%
  ggplot(aes(x = MatchType, y = max_sci_sim)) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "point", size = 3) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "errorbar", width = 0) +
  theme_minimal() + xlab("Partisan origin of matched policy documents") + ylab("Average pairwise similarity of most similar citations")

ggsave("Output/FigS23A.pdf", top_pairs_avg_max_sim_plot, width = 4, height = 4)

pairs_avg_max_sim_plot <- filter(doc_pairs_sims, is.finite(sci_sim), Sim <1) %>%
  group_by(Prime_policy_doc_id, Matched_policy_doc_id, dois_cited.x, MatchType) %>% summarise(max_sci_sim = max(sci_sim)) %>%
  ggplot(aes(x = MatchType, y = max_sci_sim)) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "point", size = 3) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "errorbar", width = 0) +
  theme_minimal() + xlab("Partisan origin of matched policy documents") + ylab("Average pairwise similarity of most similar citations")

ggsave("Output/FigS24A.pdf", pairs_avg_max_sim_plot, width = 4, height = 4)



max_sci_sim_dat <- filter(doc_pairs_sims, is.finite(sci_sim)) %>%
  group_by(Prime_policy_doc_id, Matched_policy_doc_id, dois_cited.x, MatchType, Sim) %>% summarise(max_sci_sim = max(sci_sim))

mod_sci_sim1 <- lm(max_sci_sim ~ MatchType + Sim, max_sci_sim_dat)
summary(mod_sci_sim1)
predictions(mod_sci_sim1, by = "MatchType")

mod_sci_sim2 <- lm(max_sci_sim ~ MatchType + Sim, filter(max_sci_sim_dat, Sim < 1))
summary(mod_sci_sim2)
predictions(mod_sci_sim2, by = "MatchType")


mod_sci_sim3 <- lm(max_sci_sim ~ MatchType + Sim, filter(max_sci_sim_dat, Sim < 1, Sim > .91))
summary(mod_sci_sim3)
predictions(mod_sci_sim3, by = "MatchType")


library(modelsummary)
modelsummary(models = list("All" = mod_sci_sim1, "Doc Sim in [0,1)" = mod_sci_sim2, "Doc Sim in (.91,1)" = mod_sci_sim3), output = "Output/TableS29.docx", vcov = "HC3", estimate = "{estimate} ({std.error}){stars}")



pair_doi_coints <- doc_pairs_sims %>% group_by(Prime_policy_doc_id, Matched_policy_doc_id) %>% summarise(num_prime_cites = length(unique(dois_cited.x)),
                                                                                                   num_matched_cites = length(unique(dois_cited.y)))
pair_doi_coints

doc_pairs_sims <- doc_pairs_sims %>% left_join(pair_doi_coints)

potential_case_studies <- doc_pairs_sims %>% filter(Sim < 1, Sim >.91, num_prime_cites > 10, num_matched_cites > 10, MatchType %in% c("DR", "RD"))


potential_case_studies <- potential_case_studies %>% filter(Prime_title %in% c("“Balancing Work, Health, and Family: The Case for Expanding the Family and Medical Leave Act.”", 
                                                                               "Medical Experts: Inadequate Federal Approach to Opioid Treatment and the Need to Expand Care",
                                                                               "S.Hrg. 111-942 — THE PROMISE OF HUMAN EMBRYONIC STEM CELL RESEARCH")) %>% unique()


papers <- read_csv("/Users/alexanderfurnas/Dropbox/Projects/PoliticsOfScience/Committes_Science/OvertonCitedPapers.csv")

papers <- papers %>% dplyr::select(doi, title, journal.title, times_cited, altmetric, hit_5) %>% unique()
potential_case_studies <- potential_case_studies %>% left_join(papers, by = c("dois_cited.x" = "doi"))

potential_case_studies <- potential_case_studies %>% left_join(papers, by = c("dois_cited.y" = "doi"))


prime_cites <- potential_case_studies %>% dplyr::select(Prime_title, MatchType, title.x,	dois_cited.x, corpusid.x, journal.title.x,	times_cited.x,	altmetric.x,	hit_5.x) %>% unique()
prime_cites$doc <- "Prime"

names(prime_cites) <- c("Policy_doc_title","MatchType", "paper_title", "doi", "corpusid",	"journal.title",	"times_cited",	"altmetric",	"hit_5", "policy_doc_type")

match_cites <- potential_case_studies %>% dplyr::select(Matched_title, MatchType, title.y, dois_cited.y,	corpusid.y, journal.title.y,	times_cited.y,	altmetric.y,	hit_5.y) %>% unique()
match_cites$doc <- "Match"
names(match_cites) <- c("Policy_doc_title","MatchType", "paper_title", "doi", "corpusid",	"journal.title",	"times_cited",	"altmetric",	"hit_5", "policy_doc_type")

case_docs <- bind_rows(prime_cites,match_cites)

write_csv(case_docs, "Output/PotentialCongressCaseStudies.csv")

case_docs <- read_csv("Output/PotentialCongressCaseStudies.csv")

cited_paper_embeddings <- read_csv("Data/CongTTPaperEmbeddings3272023.csv")
cited_paper_embeddings <- cited_paper_embeddings %>% dplyr::select(-`...1`)

case_docs <- case_docs %>% left_join(cited_paper_embeddings)

case_docs %>% pull(Policy_doc_title) %>% unique()

case_docs$case[case_docs$Policy_doc_title %in% c("“Balancing Work, Health, and Family: The Case for Expanding the Family and Medical Leave Act.”",
                                       "S.Hrg. 115-744 — EXAMINING THE IMPORTANCE OF PAID FAMILY LEAVE FOR AMERICAN WORKING FAMILIES")] <- "Family Leave"


library(Rtsne)
library(purrr)
case_docs_noNA <- case_docs %>%
  filter(complete.cases(select(., contains("dim")))) %>%
  select(c("doi","case", contains("dim"))) %>% unique()

corrected_output <- case_docs_noNA %>%
  split(.$case) %>%
  map(~ Rtsne(as.matrix(select(.x, contains("dim"))), perplexity = 5))

tsne_out <- bind_rows(as_tibble(corrected_output[[1]]$Y))
tsne_out

case_docs_noNA <- case_docs_noNA %>% filter(case == "Family Leave")
case_docs_noNA$V1 <- tsne_out$V1
case_docs_noNA$V2 <- tsne_out$V2
case_docs_noNA <- case_docs_noNA %>% dplyr::select(doi, V1, V2)
case_docs <- case_docs %>% left_join(case_docs_noNA)

library(ggrepel)
case_study_plot <- case_docs %>% ggplot(aes(x=V1, y=V2, color = policy_doc_type, label = paper_title)) + 
  geom_point(alpha = .7, size = 2) + geom_text_repel(size = 1, color = "black") +
  theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.line = element_blank(),
        legend.position = "top") +
  scale_color_manual("Policy Doc", values = c( "#9A3E25", "#156B90"), labels = c("Republican", "Democrat"))# Removes both x and y axis lines

ggsave("Output/FigS21B.pdf", case_study_plot, width = 12, height = 16)





######## Think Tank Matching


rm(list=ls())
getmode <- function(v, na.omit = TRUE){
  if (na.omit == TRUE) {
    v <- na.omit(v)  
  }
  uv <- unique(v)
  tab <- tabulate(match(v, uv))
  out <- uv[tab == max(tab)]
  if(length(out) > 1){
    out <- sample(out,1)
  }
  return(out)
}
library(tidyverse)
setwd("~/Dropbox/Projects/PoliticsOfScience/Committes_Science/ScienceReplication/")


DtoD <- read_csv("Data/LtoLThinkTankMatches3.csv") %>% dplyr::select(-'...1')
DtoR <- read_csv("Data/LtoRThinkTankMatches3.csv") %>% dplyr::select(-'...1')
RtoR <- read_csv("Data/RtoRThinkTankMatches3.csv") %>% dplyr::select(-'...1')
RtoD <- read_csv("Data/RtoLThinkTankMatches3.csv") %>% dplyr::select(-'...1')


names(DtoR) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_policy_document_title", "Matched_policy_document_title")
names(DtoD) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_policy_document_title", "Matched_policy_document_title")
names(RtoD) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_policy_document_title", "Matched_policy_document_title")
names(RtoR) <- c("PrimeIndex", "MatchedIndex", "Sim", "Prime_policy_doc_id", "Prime_party_control", "Matched_policy_doc_id", "Matched_party_control", "Prime_policy_document_title", "Matched_policy_document_title")

DtoD$MatchType <- "LL"
DtoR$MatchType <- "LR"
RtoR$MatchType <- "RR"
RtoD$MatchType <- "RL"


doc_pairs <- bind_rows(DtoR, DtoD, RtoD, RtoR)



hearings<- read_csv("Data/ThinkTankDocumentsCitations.csv")

####In matched pairs, do Democrats cite MORE science than Republicans?
hearings_pol_citation_count <- hearings %>% group_by(policy_document_id) %>% summarise(num_refs = length(unique(dois_cited[!is.na(dois_cited)])))
hearings_pol_citation_count

doc_pairs_citation <- doc_pairs %>% 
  left_join(hearings_pol_citation_count, by = c("Prime_policy_doc_id" = "policy_document_id")) %>% 
  right_join(hearings_pol_citation_count, by = c("Matched_policy_doc_id" = "policy_document_id")) %>%
  rename(Prime_num_refs = num_refs.x, Matched_num_refs = num_refs.y)

LR_t <- t.test(filter(doc_pairs_citation, MatchType == "LR")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "LR")$Matched_num_refs)
LL_t <- t.test(filter(doc_pairs_citation, MatchType == "LL")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "LL")$Matched_num_refs)
RL_t <- t.test(filter(doc_pairs_citation, MatchType == "RL")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RL")$Matched_num_refs)
RR_t <- t.test(filter(doc_pairs_citation, MatchType == "RR")$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RR")$Matched_num_refs)

LR_t_top <- t.test(filter(doc_pairs_citation, MatchType == "LR", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "LR", Sim >.91, Sim < 1)$Matched_num_refs)
LL_t_top <- t.test(filter(doc_pairs_citation, MatchType == "LL", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "LL", Sim >.91, Sim < 1)$Matched_num_refs)
RL_t_top <- t.test(filter(doc_pairs_citation, MatchType == "RL", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RL", Sim >.91, Sim < 1)$Matched_num_refs)
RR_t_top <- t.test(filter(doc_pairs_citation, MatchType == "RR", Sim >.91, Sim < 1)$Prime_num_refs, filter(doc_pairs_citation, MatchType == "RR", Sim >.91, Sim < 1)$Matched_num_refs)


tab <- map_df(list(LR_t, LL_t, RL_t, RR_t), tidy)
tab <- bind_cols("MatchType" = c("LR", "LL", "RL", "RR"), tab)
tab$filter <- "All"
tab_top <- map_df(list(LR_t_top, LL_t_top, RL_t_top, RR_t_top), tidy)
tab_top <- bind_cols("MatchType" = c("LR", "LL", "RL", "RR"), tab_top)
tab_top$filter <- "(.91-1)"

tab_all <- bind_rows(tab, tab_top)


library(broom)
library(purrr)


write_csv(tab_all, "Output/TableS28.csv")

doi_id <- read_csv("Data/CongTTDOI.csv") %>% dplyr::select(-`...1`)


doc_pairs <- doc_pairs %>% left_join(dplyr::select(hearings, policy_document_id, dois_cited), by = c("Prime_policy_doc_id" = "policy_document_id")) %>%
  left_join(doi_id, by = c("dois_cited" = "externalids"))

doc_pairs <- doc_pairs %>% left_join(dplyr::select(hearings, policy_document_id, dois_cited), by = c("Matched_policy_doc_id" = "policy_document_id")) %>%
  left_join(doi_id, by = c("dois_cited.y" = "externalids"))

doc_pairs <- doc_pairs %>%drop_na() %>% unique()



gc()
cited_paper_embeddings <- read_csv("Data/CongTTPaperEmbeddings3272023.csv")
cited_paper_embeddings <- cited_paper_embeddings %>% dplyr::select(-`...1`)

doc_pairs <- doc_pairs %>% left_join(cited_paper_embeddings, by = c("corpusid.x" = "corpusid"))


doc_pairs <- doc_pairs %>% left_join(cited_paper_embeddings, by = c("corpusid.y" = "corpusid"))

rm(list = c("cited_paper_embeddings"))
gc()




#retaining similarity calculation for transparency -- code is different than for congress because the data is bigger so needed to be more efficient
# doc1 <- as.matrix(doc_pairs[, 16:783])
# doc2 <- as.matrix(doc_pairs[, 785:1552])
# 
# # Compute the norms for each row
# norms_doc1 <- sqrt(rowSums(doc1^2))
# norms_doc2 <- sqrt(rowSums(doc2^2))
# 
# # Compute the dot products for each pair of rows
# dot_products <- rowSums(doc1 * doc2)
# 
# # Calculate the cosine similarity
# cosine_similarities <- dot_products / (norms_doc1 * norms_doc2)
# 
# # Add the cosine similarities to your data frame
# doc_pairs$sci_sim <- cosine_similarities
# rm(list = c("doc1", "doc2","norms_doc1", "norms_doc2", "dot_products", "cosine_similarities"))
# 
# 
# 
# 
# doc_pairs_sims <- doc_pairs %>% dplyr::select(-contains("dim"))


#load pre-calculated pairwise similarities
doc_pairs_sims <- read_csv("Data/ThinkTankMatchedPairCitationSimilarities.csv")

hearings<- read_csv("Data/ThinkTankDocumentsCitations.csv") %>% 
  dplyr::select(policy_document_id, title.x, policy_source_title,  published_on, Left, Right) %>% unique()




library(boot)
doc_pairs_DRRD <- doc_pairs %>% filter(MatchType %in% c("LR", "RL"), Sim > .91, Sim < 1)

doc_pairs_DRRD_D <- doc_pairs_DRRD %>% dplyr::filter(Prime_party_control == "L")

doc_pairs_DRRD_D1 <- doc_pairs_DRRD_D %>% dplyr::select(Prime_party_control, contains(".x"))
doc_pairs_DRRD_D2 <- doc_pairs_DRRD_D %>% dplyr::select(Matched_party_control, contains(".y"))


doc_pairs_DRRD_R <- doc_pairs_DRRD %>% dplyr::filter(Prime_party_control == "R")

doc_pairs_DRRD_R1 <- doc_pairs_DRRD_R %>% dplyr::select(Prime_party_control, contains(".x"))
doc_pairs_DRRD_R2 <- doc_pairs_DRRD_R %>% dplyr::select(Matched_party_control, contains(".y"))

doc_pairs_DRRD_R1D1 <- bind_rows(doc_pairs_DRRD_D1, doc_pairs_DRRD_R1)
doc_pairs_DRRD_R1D1 <- doc_pairs_DRRD_R1D1 %>% unique()

doc_pairs_DRRD_R2D2 <- bind_rows(doc_pairs_DRRD_D2, doc_pairs_DRRD_R2)
doc_pairs_DRRD_R2D2 <- doc_pairs_DRRD_R2D2 %>% unique()

doc_pairs_DRRD_R2D2$prime <- 0
doc_pairs_DRRD_R1D1$prime <- 1

names(doc_pairs_DRRD_R2D2) <- names(doc_pairs_DRRD_R1D1)
doc_pairs_DRRD_all <- bind_rows(doc_pairs_DRRD_R2D2, doc_pairs_DRRD_R1D1)

gc()

write_csv(doc_pairs_DRRD_all, "Output/TopMatchedThinkTankEmbeddingsForClustering.csv")
library(ClusterR)

cluster_papers <- function(data){
  bsize = nrow(data)/3
  bsize = min(c(1000,bsize))
  
  max_clust = min(nrow(data) - 2, 75)
  
  cluster_dat <- data %>% dplyr::select(contains("dim"))
  if (nrow(cluster_dat) > 50){
    pmk <- Optimal_Clusters_KMeans(as.matrix(cluster_dat), max_clusters = max_clust, verbose = T, criterion = "BIC",
                                   mini_batch_params = list("batch_size" =  bsize, "init_fraction" = .1, "early_stop_iter" = 10), seed = 666)
    
    
    
    num_clust <- which(pmk == min(pmk))
    
    pmk_mini <- MiniBatchKmeans(
      as.matrix(cluster_dat),
      clusters = num_clust,
      batch_size = bsize,
      num_init = 1,
      max_iters = 100,
      init_fraction = 1,
      initializer = "kmeans++",
      early_stop_iter = 10,
      verbose = T,
      CENTROIDS = NULL,
      tol = 1e-04,
      tol_optimal_init = 0.3,
      seed = 1
    )
    
    cluster_assign <- predict_MBatchKMeans(as.matrix(cluster_dat), pmk_mini$centroids, fuzzy = FALSE)
    
    data$cluster <- cluster_assign
  } else {
    data$cluster <- NA
  }
  gc()
  return(data)
  
}
doc_pairs_DRRD_all <- read_csv("Output/TopMatchedThinkTankEmbeddingsForClustering.csv")
doc_pairs_DRRD_all <- doc_pairs_DRRD_all %>% drop_na()
doc_pairs_DRRD_all <- cluster_papers(doc_pairs_DRRD_all)
colnames(doc_pairs_DRRD_all)
library(RColorBrewer)
library(randomcoloR)

palette <- distinctColorPalette(length(unique(doc_pairs_DRRD_all$cluster)))

doc_pairs_DRRD_all <- doc_pairs_DRRD_all %>% mutate(cluster = fct_rev(fct_reorder(as.factor(cluster), dois_cited.x,  .fun = "n_distinct")))
doc_LR_matched_bar <- doc_pairs_DRRD_all %>% ggplot(aes(x=cluster, fill = as.factor(cluster))) + geom_bar() + facet_wrap(~ Prime_party_control, nrow = 2) + 
   scale_colour_manual(name = "cluster",values = palette) + scale_fill_manual(name = "cluster",values = palette) + theme_void() + theme(legend.position = "none")


ggsave("Output/FigS22B.pdf", doc_LR_matched_bar, width = 8, height = 4)

chisq.test(table(doc_pairs_DRRD_all$Prime_party_control, doc_pairs_DRRD_all$cluster))


bootstrap_mean_ci <- function(y, R = 1000) {
  # Bootstrap procedure
  boot_func <- function(data, indices) mean(data[indices])
  bs_result <- boot(y, boot_func, R = R)
  
  mean_value <- mean(y)
  ci_values <- boot.ci(bs_result, type = "perc")$percent[4:5]  # 95% CI
  
  return(c(y = mean_value, ymin = ci_values[1], ymax = ci_values[2]))
}

filter(doc_pairs_sims, is.finite(sci_sim), Sim <1) %>% select(PrimeIndex, MatchedIndex) %>% unique()
# Plot with ggplot2 using stat_summary

doc_pairs_sims <- doc_pairs_sims %>% mutate(MatchType = fct_relevel(MatchType, "LL", "LR", "RR", "RL"))

top_pairs_avg_max_sim_plot <- filter(doc_pairs_sims, is.finite(sci_sim), Sim <1, Sim > .91) %>%
  group_by(Prime_policy_doc_id, Matched_policy_doc_id, dois_cited.x, MatchType) %>% summarise(max_sci_sim = max(sci_sim)) %>%
  ggplot(aes(x = MatchType, y = max_sci_sim)) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "point", size = 3) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "errorbar", width = 0) +
  theme_minimal() + xlab("Partisan origin of matched policy documents") + ylab("Average pairwise similarity of most similar citations")

ggsave("Output/FigS23B.pdf", top_pairs_avg_max_sim_plot, width = 4, height = 4)

pairs_avg_max_sim_plot <- filter(doc_pairs_sims, is.finite(sci_sim), Sim <1) %>%
  group_by(Prime_policy_doc_id, Matched_policy_doc_id, dois_cited.x, MatchType) %>% summarise(max_sci_sim = max(sci_sim)) %>%
  ggplot(aes(x = MatchType, y = max_sci_sim)) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "point", size = 3) +
  stat_summary(fun.data = bootstrap_mean_ci, geom = "errorbar", width = 0) +
  theme_minimal() + xlab("Partisan origin of matched policy documents") + ylab("Average pairwise similarity of most similar citations")

ggsave("Output/FigS24B.pdf", pairs_avg_max_sim_plot, width = 4, height = 4)


library(marginaleffects)
max_sci_sim_dat <- filter(doc_pairs_sims, is.finite(sci_sim)) %>%
  group_by(Prime_policy_doc_id, Matched_policy_doc_id, dois_cited.x, MatchType, Sim) %>% summarise(max_sci_sim = max(sci_sim))

mod_sci_sim1 <- lm(max_sci_sim ~ MatchType + Sim, max_sci_sim_dat)
summary(mod_sci_sim1)
predictions(mod_sci_sim1, by = "MatchType")

mod_sci_sim2 <- lm(max_sci_sim ~ MatchType + Sim, filter(max_sci_sim_dat, Sim < 1))
summary(mod_sci_sim2)
predictions(mod_sci_sim2, by = "MatchType")


mod_sci_sim3 <- lm(max_sci_sim ~ MatchType + Sim, filter(max_sci_sim_dat, Sim < 1, Sim > .91))
summary(mod_sci_sim3)
predictions(mod_sci_sim3, by = "MatchType")


library(modelsummary)
modelsummary(models = list("All" = mod_sci_sim1, "Doc Sim in [0,1)" = mod_sci_sim2, "Doc Sim in (.91,1)" = mod_sci_sim3), output = "Output/TableS30.docx", vcov = "HC3", estimate = "{estimate} ({std.error}){stars}")





pair_doi_coints <- doc_pairs_sims %>% group_by(Prime_policy_doc_id, Matched_policy_doc_id) %>% summarise(num_prime_cites = length(unique(dois_cited.x)),
                                                                                                         num_matched_cites = length(unique(dois_cited.y)))
pair_doi_coints

doc_pairs_sims <- doc_pairs_sims %>% left_join(pair_doi_coints)

potential_case_studies <- doc_pairs_sims %>% filter(Sim < 1, Sim >.91, num_prime_cites > 10, num_matched_cites > 10, MatchType %in% c("LR", "RL"))

potential_case_studies %>% select(Prime_policy_document_title, Matched_policy_document_title) %>% unique() %>% View()

potential_case_studies <- potential_case_studies %>% filter(Prime_policy_document_title %in% c("Effects of Climate Change on Heat- and Cold-Related Mortality: A Literature Review to Inform Updated Estimates of the Social Cost of Carbon", 
                                                                               "The economics of federal tax policy",
                                                                               "Understanding the Implications of Raising the Minimum Wage in the District of Columbia")) %>% unique()


papers <- read_csv("Data/OvertonCitedPapers.csv")

papers <- papers %>% dplyr::select(doi, title, journal.title, times_cited, altmetric, hit_5) %>% unique()
potential_case_studies <- potential_case_studies %>% left_join(papers, by = c("dois_cited.x" = "doi"))

potential_case_studies <- potential_case_studies %>% left_join(papers, by = c("dois_cited.y" = "doi"))


prime_cites <- potential_case_studies %>% dplyr::select(Prime_policy_document_title, MatchType, title.x,	dois_cited.x, corpusid.x, journal.title.x,	times_cited.x,	altmetric.x,	hit_5.x) %>% unique()
prime_cites$doc <- "Prime"

names(prime_cites) <- c("Policy_doc_title","MatchType", "paper_title", "doi", "corpusid",	"journal.title",	"times_cited",	"altmetric",	"hit_5", "policy_doc_type")

match_cites <- potential_case_studies %>% dplyr::select(Matched_policy_document_title, MatchType, title.y, dois_cited.y,	corpusid.y, journal.title.y,	times_cited.y,	altmetric.y,	hit_5.y) %>% unique()
match_cites$doc <- "Match"
names(match_cites) <- c("Policy_doc_title","MatchType", "paper_title", "doi", "corpusid",	"journal.title",	"times_cited",	"altmetric",	"hit_5", "policy_doc_type")

case_docs <- bind_rows(prime_cites,match_cites)

write_csv(case_docs, "Output/PotentialThinkTankCaseStudies.csv")

case_docs <- read_csv("Output/PotentialThinkTankCaseStudies.csv")

cited_paper_embeddings <- read_csv("Data/CongTTPaperEmbeddings3272023.csv")
cited_paper_embeddings <- cited_paper_embeddings %>% dplyr::select(-`...1`)


case_docs <- case_docs %>% left_join(cited_paper_embeddings)

case_docs %>% pull(Policy_doc_title) %>% unique()


case_docs$case[case_docs$Policy_doc_title %in% c("Effects of Climate Change on Heat- and Cold-Related Mortality: A Literature Review to Inform Updated Estimates of the Social Cost of Carbon",
                                                 "The Social Cost of Carbon with Economic and Climate Risk")] <- "Social Cost of Carbon"


case_docs$case[case_docs$Policy_doc_title %in% c("The economics of federal tax policy",
                                                 "The Hidden Cost of Federal Tax Policy")] <- "Federal Tax Policy"


case_docs$case[case_docs$Policy_doc_title %in% c("The Employment and Distributional Effects of Minimum Wage Increases: A Case Study of the State of New York",
                                                 "Understanding the Implications of Raising the Minimum Wage in the District of Columbia")] <- "Minimum Wage"


library(Rtsne)
library(purrr)
case_docs_noNA <- case_docs %>%
  filter(complete.cases(select(., contains("dim")))) %>%
  select(c("doi","case", contains("dim"))) %>% unique()

corrected_output <- case_docs_noNA %>%
  split(.$case) %>%
  map(~ Rtsne(as.matrix(select(.x, contains("dim"))), perplexity = 5))

tsne_out <- bind_rows(as_tibble(corrected_output[[1]]$Y),as_tibble(corrected_output[[2]]$Y),as_tibble(corrected_output[[3]]$Y))

case_docs_noNA$V1 <- tsne_out$V1
case_docs_noNA$V2 <- tsne_out$V2
case_docs_noNA <- case_docs_noNA %>% dplyr::select(doi, V1, V2)
case_docs <- case_docs %>% left_join(case_docs_noNA)

library(ggrepel)
case_study_plot <- case_docs %>% filter(case == "Minimum Wage") %>% ggplot(aes(x=V1, y=V2, color = policy_doc_type, label = paper_title)) + 
  geom_point(alpha = .7, size = 2) + geom_text(size = 1, color = "black") +
  facet_wrap(~case,scales = "free", ncol = 1) + theme_classic() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.line = element_blank(),
        legend.position = "top") +
  scale_color_manual("Policy Doc", values = c( "#9A3E25", "#156B90"), labels = c("Republican", "Democrat"))# Removes both x and y axis lines

ggsave("Output/FigS21A.pdf", case_study_plot, width = 12, height = 6)











## Matched Overlap
rm(list = ls())
library(tidyverse)
setwd("~/Downloads/dataverse_files(3)")
doc_pairs_sims_cong <- read_csv("Data/CongressMatchedPairCitationSimilarities.csv")

doc_pairs_sims_cong_prime <- doc_pairs_sims_cong %>% dplyr::select(Prime_policy_doc_id, Prime_party_control, dois_cited.x)
names(doc_pairs_sims_cong_prime) <- c("policy_doc_id", "party_control", "doi")

doc_pairs_sims_cong_matched <- doc_pairs_sims_cong %>% dplyr::select(Matched_policy_doc_id, Matched_party_control, dois_cited.y)
names(doc_pairs_sims_cong_matched) <- c("policy_doc_id", "party_control", "doi")

docs_foroverlap_cong <- bind_rows(doc_pairs_sims_cong_matched, doc_pairs_sims_cong_prime)

docs_foroverlap_cong <- docs_foroverlap_cong %>% unique()

num_pcites_cong <- docs_foroverlap_cong %>% group_by(doi) %>% summarise(num_pcites = length(unique(policy_doc_id)))
docs_foroverlap_cong <- docs_foroverlap_cong %>% left_join(num_pcites_cong)


doc_pairs_sims_tt <- read_csv("Data/ThinkTankMatchedPairCitationSimilarities.csv")

doc_pairs_sims_tt_prime <- doc_pairs_sims_tt %>% dplyr::select(Prime_policy_doc_id, Prime_party_control, dois_cited.x)
names(doc_pairs_sims_tt_prime) <- c("policy_doc_id", "party_control", "doi")

doc_pairs_sims_tt_matched <- doc_pairs_sims_tt %>% dplyr::select(Matched_policy_doc_id, Matched_party_control, dois_cited.y)
names(doc_pairs_sims_tt_matched) <- c("policy_doc_id", "party_control", "doi")

docs_foroverlap_tt <- bind_rows(doc_pairs_sims_tt_matched, doc_pairs_sims_tt_prime)

docs_foroverlap_tt <- docs_foroverlap_tt %>% unique()

num_pcites_tt <- docs_foroverlap_tt %>% group_by(doi) %>% summarise(num_pcites = length(unique(policy_doc_id)))
docs_foroverlap_tt <- docs_foroverlap_tt %>% left_join(num_pcites_tt)



permute_party_cong <- function(data){
  data$party_control <- sample(data$party_control) 
  data <- data %>% mutate(dem = case_when(party_control == "D" ~ 1,
                                          TRUE ~ 0),
                          rep = case_when(party_control == "R" ~ 1,
                                          TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(dem = sum(dem), rep = sum(rep))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem > 0] <- "Both"
  paper_dat$party_cite[paper_dat$rep == 0 & paper_dat$dem > 0] <- "D"
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem == 0] <- "R"
  
  return(ungroup(paper_dat))
  
}

observed_party_cong <- function(data){
  data <- data %>% mutate(dem = case_when(party_control == "D" ~ 1,
                                          TRUE ~ 0),
                          rep = case_when(party_control == "R" ~ 1,
                                          TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(dem = sum(dem), rep = sum(rep))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem > 0] <- "Both"
  paper_dat$party_cite[paper_dat$rep == 0 & paper_dat$dem > 0] <- "D"
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem == 0] <- "R"
  return(ungroup(paper_dat))
  
}

permute_party_tt <- function(data){
  data$party_control <- sample(data$party_control) 
  data <- data %>% mutate(Left = case_when(party_control == "L" ~ 1,
                                           TRUE ~ 0),
                          Right = case_when(party_control == "R" ~ 1,
                                            TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(Left = sum(Left), Right = sum(Right))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left > 0] <- "Both"
  paper_dat$party_cite[paper_dat$Right == 0 & paper_dat$Left > 0] <- "L"
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left == 0] <- "R"
  
  return(ungroup(paper_dat))
  
}

observed_party_tt <- function(data){
  data <- data %>% mutate(Left = case_when(party_control == "L" ~ 1,
                                           TRUE ~ 0),
                          Right = case_when(party_control == "R" ~ 1,
                                            TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(Left = sum(Left), Right = sum(Right))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left > 0] <- "Both"
  paper_dat$party_cite[paper_dat$Right == 0 & paper_dat$Left > 0] <- "L"
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left == 0] <- "R"
  
  return(ungroup(paper_dat))
  
}

cong_perm <- replicate(100, permute_party_cong(docs_foroverlap_cong), simplify = F)
tt_perm <- replicate(100, permute_party_tt(docs_foroverlap_tt), simplify = F)


library(purrr)
cong_perm_dist_geq2<- cong_perm %>% map(~dplyr::filter(., num_pcites > 1)) %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()
tt_perm_dist_geq2<- tt_perm %>% map(~dplyr::filter(., num_pcites > 1)) %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()

cong_perm_dist_geq2 <- tibble("value" = cong_perm_dist_geq2, "institution" = "Congress", "threshold" = 2)
tt_perm_dist_geq2 <- tibble("value" = tt_perm_dist_geq2, "institution" = "Think Tank", "threshold" = 2)


cong_perm_dist_geq1<- cong_perm %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()
tt_perm_dist_geq1<- tt_perm  %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()

cong_perm_dist_geq1 <- tibble("value" = cong_perm_dist_geq1, "institution" = "Congress", "threshold" = 1)
tt_perm_dist_geq1 <- tibble("value" = tt_perm_dist_geq1, "institution" = "Think Tank", "threshold" = 1)

permuted_overlap_results <- bind_rows(cong_perm_dist_geq2, tt_perm_dist_geq2, cong_perm_dist_geq1, tt_perm_dist_geq1)



cong_observed_geq2 <- observed_party_cong(docs_foroverlap_cong) %>% filter(party_cite == "Both", num_pcites > 1) %>% nrow()/length(unique((filter(docs_foroverlap_cong, num_pcites > 1)$doi)))
tt_observed_geq2 <- observed_party_tt(docs_foroverlap_tt) %>% filter(party_cite == "Both", num_pcites > 1) %>% nrow()/length(unique((filter(docs_foroverlap_tt, num_pcites > 1)$doi)))

cong_observed_geq1 <- observed_party_cong(docs_foroverlap_cong) %>% filter(party_cite == "Both") %>% nrow()/length(unique(docs_foroverlap_cong$doi))
tt_observed_geq1 <- observed_party_tt(docs_foroverlap_tt) %>% filter(party_cite == "Both") %>% nrow()/length(unique(docs_foroverlap_tt$doi))

observed_results <- tibble("value" = c(cong_observed_geq2, tt_observed_geq2, cong_observed_geq1, tt_observed_geq1),
                           "institution" = c("Congress", "Think Tank", "Congress", "Think Tank"), 
                           "threshold" = c(2,2,1,1))


direct_label_dat <- tibble(text = c("Observed share", "Expected share\n(Null model)", "Observed share", "Expected share\n(Null model)","Observed share", "Expected share\n(Null model)","Observed share", "Expected share\n(Null model)"),
                           institution = c("Congress", "Congress","Congress", "Congress", "Think Tank", "Think Tank", "Think Tank", "Think Tank"),
                           value= c(.12, .12, .37, .58, .11, .13, .26, .51), 
                           y = c(.9,.25, .9, .25, .9, .25, .9, .25), 
                           threshold = c(1,1,2,2,1,1,2,2)
)

library(ggdist)
permuted_overlap_chart <- permuted_overlap_results %>%
  ggplot(aes(x=value, color = as.factor(threshold), group = as.factor(threshold))) + 
  stat_slabinterval() +
  facet_wrap(~institution, scales = "free", nrow = 2) + 
  geom_vline(data = observed_results, aes(xintercept = value, color = as.factor(threshold)), linetype = 4) +
  scale_x_continuous("Percent of bipartisan cited papers", labels = scales::percent, limits = c(0,.65)) +
  theme_minimal() + 
  scale_color_brewer("", palette = "Dark2", labels = c("All papers with policy citations", "Papers with 2+ policy citations")) +
  theme(legend.position = "top",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank())  + 
  geom_text(data = direct_label_dat, aes(y = y, label = text), size = 2, fontface = "bold")

ggsave("Output/MatchedPermutedOverlap.pdf", permuted_overlap_chart, width = 7, height = 4)




#### Top Matched Overlap


doc_pairs_sims_cong <- read_csv("Data/CongressMatchedPairCitationSimilarities.csv")

doc_pairs_sims_cong_prime <- doc_pairs_sims_cong %>% filter(Sim <1, Sim >.91) %>% dplyr::select(Prime_policy_doc_id, Prime_party_control, dois_cited.x)
names(doc_pairs_sims_cong_prime) <- c("policy_doc_id", "party_control", "doi")

doc_pairs_sims_cong_matched <- doc_pairs_sims_cong %>% filter(Sim <1, Sim >.91) %>% dplyr::select(Matched_policy_doc_id, Matched_party_control, dois_cited.y)
names(doc_pairs_sims_cong_matched) <- c("policy_doc_id", "party_control", "doi")

docs_foroverlap_cong <- bind_rows(doc_pairs_sims_cong_matched, doc_pairs_sims_cong_prime)

docs_foroverlap_cong <- docs_foroverlap_cong %>% unique()

num_pcites_cong <- docs_foroverlap_cong %>% group_by(doi) %>% summarise(num_pcites = length(unique(policy_doc_id)))
docs_foroverlap_cong <- docs_foroverlap_cong %>% left_join(num_pcites_cong)


doc_pairs_sims_tt <- read_csv("Data/ThinkTankMatchedPairCitationSimilarities.csv")

doc_pairs_sims_tt_prime <- doc_pairs_sims_tt %>% filter(Sim <1, Sim >.91) %>% dplyr::select(Prime_policy_doc_id, Prime_party_control, dois_cited.x)
names(doc_pairs_sims_tt_prime) <- c("policy_doc_id", "party_control", "doi")

doc_pairs_sims_tt_matched <- doc_pairs_sims_tt %>% filter(Sim <1, Sim >.91) %>% dplyr::select(Matched_policy_doc_id, Matched_party_control, dois_cited.y)
names(doc_pairs_sims_tt_matched) <- c("policy_doc_id", "party_control", "doi")

docs_foroverlap_tt <- bind_rows(doc_pairs_sims_tt_matched, doc_pairs_sims_tt_prime)

docs_foroverlap_tt <- docs_foroverlap_tt %>% unique()

num_pcites_tt <- docs_foroverlap_tt %>% group_by(doi) %>% summarise(num_pcites = length(unique(policy_doc_id)))
docs_foroverlap_tt <- docs_foroverlap_tt %>% left_join(num_pcites_tt)



permute_party_cong <- function(data){
  data$party_control <- sample(data$party_control) 
  data <- data %>% mutate(dem = case_when(party_control == "D" ~ 1,
                                          TRUE ~ 0),
                          rep = case_when(party_control == "R" ~ 1,
                                          TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(dem = sum(dem), rep = sum(rep))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem > 0] <- "Both"
  paper_dat$party_cite[paper_dat$rep == 0 & paper_dat$dem > 0] <- "D"
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem == 0] <- "R"
  
  return(ungroup(paper_dat))
  
}

observed_party_cong <- function(data){
  data <- data %>% mutate(dem = case_when(party_control == "D" ~ 1,
                                          TRUE ~ 0),
                          rep = case_when(party_control == "R" ~ 1,
                                          TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(dem = sum(dem), rep = sum(rep))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem > 0] <- "Both"
  paper_dat$party_cite[paper_dat$rep == 0 & paper_dat$dem > 0] <- "D"
  paper_dat$party_cite[paper_dat$rep > 0 & paper_dat$dem == 0] <- "R"
  return(ungroup(paper_dat))
  
}

permute_party_tt <- function(data){
  data$party_control <- sample(data$party_control) 
  data <- data %>% mutate(Left = case_when(party_control == "L" ~ 1,
                                           TRUE ~ 0),
                          Right = case_when(party_control == "R" ~ 1,
                                            TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(Left = sum(Left), Right = sum(Right))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left > 0] <- "Both"
  paper_dat$party_cite[paper_dat$Right == 0 & paper_dat$Left > 0] <- "L"
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left == 0] <- "R"
  
  return(ungroup(paper_dat))
  
}

observed_party_tt <- function(data){
  data <- data %>% mutate(Left = case_when(party_control == "L" ~ 1,
                                           TRUE ~ 0),
                          Right = case_when(party_control == "R" ~ 1,
                                            TRUE ~ 0))
  paper_dat <- data %>% group_by(doi, num_pcites) %>%
    summarise(Left = sum(Left), Right = sum(Right))
  
  paper_dat$party_cite <- NA
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left > 0] <- "Both"
  paper_dat$party_cite[paper_dat$Right == 0 & paper_dat$Left > 0] <- "L"
  paper_dat$party_cite[paper_dat$Right > 0 & paper_dat$Left == 0] <- "R"
  
  return(ungroup(paper_dat))
  
}

cong_perm <- replicate(100, permute_party_cong(docs_foroverlap_cong), simplify = F)
tt_perm <- replicate(100, permute_party_tt(docs_foroverlap_tt), simplify = F)


library(purrr)
cong_perm_dist_geq2<- cong_perm %>% map(~dplyr::filter(., num_pcites > 1)) %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()
tt_perm_dist_geq2<- tt_perm %>% map(~dplyr::filter(., num_pcites > 1)) %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()

cong_perm_dist_geq2 <- tibble("value" = cong_perm_dist_geq2, "institution" = "Congress", "threshold" = 2)
tt_perm_dist_geq2 <- tibble("value" = tt_perm_dist_geq2, "institution" = "Think Tank", "threshold" = 2)


cong_perm_dist_geq1<- cong_perm %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()
tt_perm_dist_geq1<- tt_perm  %>% map(~sum(.$party_cite == "Both")/nrow(.)) %>% unlist()

cong_perm_dist_geq1 <- tibble("value" = cong_perm_dist_geq1, "institution" = "Congress", "threshold" = 1)
tt_perm_dist_geq1 <- tibble("value" = tt_perm_dist_geq1, "institution" = "Think Tank", "threshold" = 1)

permuted_overlap_results <- bind_rows(cong_perm_dist_geq2, tt_perm_dist_geq2, cong_perm_dist_geq1, tt_perm_dist_geq1)



cong_observed_geq2 <- observed_party_cong(docs_foroverlap_cong) %>% filter(party_cite == "Both", num_pcites > 1) %>% nrow()/length(unique((filter(docs_foroverlap_cong, num_pcites > 1)$doi)))
tt_observed_geq2 <- observed_party_tt(docs_foroverlap_tt) %>% filter(party_cite == "Both", num_pcites > 1) %>% nrow()/length(unique((filter(docs_foroverlap_tt, num_pcites > 1)$doi)))

cong_observed_geq1 <- observed_party_cong(docs_foroverlap_cong) %>% filter(party_cite == "Both") %>% nrow()/length(unique(docs_foroverlap_cong$doi))
tt_observed_geq1 <- observed_party_tt(docs_foroverlap_tt) %>% filter(party_cite == "Both") %>% nrow()/length(unique(docs_foroverlap_tt$doi))

observed_results <- tibble("value" = c(cong_observed_geq2, tt_observed_geq2, cong_observed_geq1, tt_observed_geq1),
                           "institution" = c("Congress", "Think Tank", "Congress", "Think Tank"), 
                           "threshold" = c(2,2,1,1))


direct_label_dat <- tibble(text = c("Observed share", "Expected share\n(Null model)", "Observed share", "Expected share\n(Null model)","Observed share", "Expected share\n(Null model)","Observed share", "Expected share\n(Null model)"),
                           institution = c("Congress", "Congress","Congress", "Congress", "Think Tank", "Think Tank", "Think Tank", "Think Tank"),
                           value= c(.12, .12, .37, .58, .11, .13, .26, .51), 
                           y = c(.9,.25, .9, .25, .9, .25, .9, .25), 
                           threshold = c(1,1,2,2,1,1,2,2)
)

library(ggdist)
permuted_overlap_chart <- permuted_overlap_results %>%
  ggplot(aes(x=value, color = as.factor(threshold), group = as.factor(threshold))) + 
  stat_slabinterval() +
  facet_wrap(~institution, scales = "free", nrow = 2) + 
  geom_vline(data = observed_results, aes(xintercept = value, color = as.factor(threshold)), linetype = 4) +
  scale_x_continuous("Percent of bipartisan cited papers", labels = scales::percent, limits = c(0,.65)) +
  theme_minimal() + 
  scale_color_brewer("", palette = "Dark2", labels = c("All papers with policy citations", "Papers with 2+ policy citations")) +
  theme(legend.position = "top",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank())  + 
  geom_text(data = direct_label_dat, aes(y = y, label = text), size = 2, fontface = "bold")

ggsave("Output/FigS21.pdf", permuted_overlap_chart, width = 7, height = 4)

